{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 354,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "from tqdm import tqdm\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import xgboost as xgb\n",
    "import lightgbm as lgb\n",
    "import catboost as cab\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import classification_report, f1_score\n",
    "from sklearn.model_selection import StratifiedKFold, KFold\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import f1_score,precision_recall_fscore_support,roc_curve,auc,roc_auc_score\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from matplotlib import pyplot as plt\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "import pandas as pd\n",
    "import gc\n",
    "from sklearn import feature_extraction\n",
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "#from featexp import get_univariate_plots#用于特征筛选，需要先安装featexp\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "plt.rcParams['font.sans-serif']=['Simhei']\n",
    "plt.rcParams['axes.unicode_minus']=False\n",
    "import json\n",
    "import jieba\n",
    "import fasttext"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 0 数据的简单分析"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 355,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "base_info shape: (24865, 33) id unique: 24865\n",
      "annual_report_info shape: (22550, 23) id unique: 8937\n",
      "tax_info shape: (29195, 9) id unique: 808\n",
      "change_info shape: (45940, 5) id unique: 8726\n",
      "news_info shape: (10518, 3) id unique: 927\n",
      "other_info shape: (1890, 4) id unique: 1888\n",
      "entprise_info shape: (14865, 2) id unique: 14865\n",
      "entprise_evaluate shape: (10000, 2) id unique: 10000\n"
     ]
    }
   ],
   "source": [
    "base_info=pd.read_csv('train/base_info.csv')#企业的基本信息\n",
    "annual_report_info=pd.read_csv('train/annual_report_info.csv')#企业的年报基本信息\n",
    "tax_info=pd.read_csv('train/tax_info.csv')#企业的纳税信息\n",
    "change_info=pd.read_csv('train/change_info.csv')#变更信息\n",
    "news_info=pd.read_csv('train/news_info.csv')#舆情信息\n",
    "other_info=pd.read_csv('train/other_info.csv')#其它信息\n",
    "entprise_info=pd.read_csv('train/entprise_info.csv')#企业标注信息{0: 13884, 1: 981}\n",
    "entprise_evaluate=pd.read_csv('entprise_evaluate.csv')#未标注信息\n",
    "\n",
    "print('base_info shape:',base_info.shape,'id unique:',len(base_info['id'].unique()))\n",
    "print('annual_report_info shape:',annual_report_info.shape,'id unique:',len(annual_report_info['id'].unique()))\n",
    "print('tax_info shape:',tax_info.shape,'id unique:',len(tax_info['id'].unique()))\n",
    "print('change_info shape:',change_info.shape,'id unique:',len(change_info['id'].unique()))\n",
    "print('news_info shape:',news_info.shape,'id unique:',len(news_info['id'].unique()))\n",
    "print('other_info shape:',other_info.shape,'id unique:',len(other_info['id'].unique()))\n",
    "print('entprise_info shape:',entprise_info.shape,'id unique:',len(entprise_info['id'].unique()))\n",
    "print('entprise_evaluate shape:',entprise_evaluate.shape,'id unique:',len(entprise_evaluate['id'].unique()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1 特征构建 \n",
    "###  tfidi处理经营范围(opscope)特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 356,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "对opscope提取tfidif特征完毕..........\n"
     ]
    }
   ],
   "source": [
    "# tfidif 处理经营范围的特征\n",
    "#cn_stopwords.txt来源于 https://github.com/goto456/stopwords\n",
    "def stopwordslist():\n",
    "    stopwords = [line.strip() for line in open('D:/tianma/stopwords-master/cn_stopwords.txt',encoding='UTF-8').readlines()]\n",
    "    return stopwords\n",
    "# 创建一个停用词列表\n",
    "stopwords = stopwordslist()\n",
    "stopwords+=['、', '；', '，', '）','（']\n",
    "#\n",
    "train_df_scope=base_info.merge(entprise_info)[['id','opscope','label']]\n",
    "test_df_scope=base_info[base_info['id'].isin(entprise_evaluate['id'].unique().tolist())]\n",
    "test_df_scope=test_df_scope.reset_index(drop=True)[['id','opscope']]\n",
    "str_label_0=''\n",
    "str_label_1=''\n",
    "for index,name,opscope,label in train_df_scope.itertuples():\n",
    "    # 结巴分词\n",
    "    seg_text = jieba.cut(opscope.replace(\"\\t\", \" \").replace(\"\\n\", \" \"))\n",
    "    outline = \" \".join(seg_text)\n",
    "    out_str=\"\"\n",
    "    for per in outline.split():\n",
    "        if per not in stopwords: \n",
    "            out_str += per\n",
    "            out_str+=\" \"\n",
    "    if label==0:\n",
    "        str_label_0+=out_str\n",
    "    else:\n",
    "        str_label_1+=out_str\n",
    "corpus=[str_label_0,str_label_1]\n",
    "vectorizer=CountVectorizer()#该类会将文本中的词语转换为词频矩阵，矩阵元素a[i][j] 表示j词在i类文本下的词频\n",
    "transformer=TfidfTransformer()#该类会统计每个词语的tf-idf权值\n",
    "tfidf=transformer.fit_transform(vectorizer.fit_transform(corpus))#第一个fit_transform是计算tf-idf，第二个fit_transform是将文本转为词频矩阵\n",
    "word=vectorizer.get_feature_names()#获取词袋模型中的所有词语总共7175个词语\n",
    "weight=tfidf.toarray()#将(2, 7175)tf-idf矩阵抽取出来，元素a[i][j]表示j词在i类文本中的tf-idf权重\n",
    "# for i in range(len(weight)):#打印每类文本的tf-idf词语权重，第一个for遍历所有文本，第二个for便利某一类文本下的词语权重\n",
    "#     #\n",
    "#     for j in range(len(word)):\n",
    "#         print(word[j],weight[i][j])\n",
    "#下面将会根据tfidi算出来的权重将经营范围的文本特征转换为数值(利用weight[1,:]也即各个词语在第二类(违法类中所占据的权重之和))\n",
    "illegal_word_weights={}\n",
    "for i in range(len(word)):\n",
    "    illegal_word_weights[word[i]]=weight[1][i]\n",
    "tfidi_opscope=[]\n",
    "for index,name,opscope in base_info[['id','opscope']].itertuples():\n",
    "    # \n",
    "    seg_text = jieba.cut(opscope.replace(\"\\t\", \" \").replace(\"\\n\", \" \"))\n",
    "    outline = \" \".join(seg_text)\n",
    "    tfidi_frt=0\n",
    "    for per in outline.split():\n",
    "        if per in illegal_word_weights: \n",
    "            tfidi_frt+=illegal_word_weights[per]\n",
    "    tfidi_opscope.append(tfidi_frt)\n",
    "base_info['tfidif_opscope']=tfidi_opscope\n",
    "print('对opscope提取tfidif特征完毕..........')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  change_info、other_info，news_info，annual_report_info,tax表格的简单特征构建"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 357,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "finished .............\n"
     ]
    }
   ],
   "source": [
    "#change_info\n",
    "change_info_clean=change_info.drop(['bgrq','bgq','bgh'],axis=1)\n",
    "change_info_clean = change_info_clean.groupby('id',sort=False).agg('mean')\n",
    "change_info_clean=pd.DataFrame(change_info_clean).reset_index()\n",
    "#other_info\n",
    "#空值大于0.5的列都删除掉\n",
    "buf_group = other_info.groupby('id',sort=False).agg('mean')\n",
    "other_info_clean=pd.DataFrame(buf_group).reset_index()\n",
    "other_info_clean=other_info_clean.fillna(-1)\n",
    "other_info_clean = other_info_clean.groupby('id',sort=False).agg('mean')\n",
    "other_info_clean=pd.DataFrame(other_info_clean).reset_index()\n",
    "#news_info\n",
    "news_info_clean=news_info.drop(['public_date'],axis=1)\n",
    "#对object类型进行编码\n",
    "news_info_clean['positive_negtive']=news_info_clean['positive_negtive'].fillna(\"中立\")\n",
    "#\n",
    "dic={}\n",
    "cate=news_info_clean.positive_negtive.unique()\n",
    "for i in range(len(cate)):\n",
    "    dic[cate[i]]=i\n",
    "#\n",
    "news_info_clean['positive_negtive']=news_info_clean['positive_negtive'].map(dic)\n",
    "news_info_clean = news_info_clean.groupby('id',sort=False).agg('mean')\n",
    "news_info_clean=pd.DataFrame(news_info_clean).reset_index()\n",
    "#处理annual_report_info的数据\n",
    "#空值大于0.5的列都删除掉\n",
    "annual_report_info_clean=annual_report_info.dropna(thresh=annual_report_info.shape[0]*0.5,how='all',axis=1)\n",
    "#对object类型进行编码\n",
    "annual_report_info_clean['BUSSTNAME']=annual_report_info_clean['BUSSTNAME'].fillna(\"无\")\n",
    "dic = {'无':-1,'开业':0, '歇业':1, '停业':2, '清算':3}\n",
    "#\n",
    "annual_report_info_clean['BUSSTNAME']=annual_report_info_clean['BUSSTNAME'].map(dic)\n",
    "annual_report_info_clean = annual_report_info_clean.groupby('id',sort=False).agg('mean')\n",
    "annual_report_info_clean=pd.DataFrame(annual_report_info_clean).reset_index()\n",
    "#处理tax数据\n",
    "tax_info_clean=tax_info.copy()\n",
    "tax_info_clean['START_DATE']=pd.to_datetime(tax_info_clean['START_DATE'])\n",
    "tax_info_clean['END_DATE']=pd.to_datetime(tax_info_clean['END_DATE'])\n",
    "tax_info_clean['gap_day']=(tax_info_clean['END_DATE']-tax_info_clean['START_DATE']).dt.total_seconds()//3600//24\n",
    "tax_info_clean=tax_info_clean.drop(['START_DATE','END_DATE'],axis=1)\n",
    "tax_info_clean['TAX_CATEGORIES']=tax_info_clean['TAX_CATEGORIES'].fillna(\"无\")#17 unique\n",
    "tax_info_clean['TAX_ITEMS']=tax_info_clean['TAX_ITEMS'].fillna(\"无\")#275 TAX_ITEMS\n",
    "#对object类型进行编码\n",
    "dic={}\n",
    "cate=tax_info_clean.TAX_CATEGORIES.unique()\n",
    "for i in range(len(cate)):\n",
    "    dic[cate[i]]=i\n",
    "tax_info_clean['TAX_CATEGORIES']=tax_info_clean['TAX_CATEGORIES'].map(dic)\n",
    "#\n",
    "dic={}\n",
    "cate=tax_info_clean.TAX_ITEMS.unique()\n",
    "for i in range(len(cate)):\n",
    "    dic[cate[i]]=i\n",
    "tax_info_clean['TAX_ITEMS']=tax_info_clean['TAX_ITEMS'].map(dic)\n",
    "tax_info_clean['income']=tax_info_clean['TAX_AMOUNT']/tax_info_clean['TAX_RATE']\n",
    "#\n",
    "tax_info_clean = tax_info_clean.groupby('id',sort=False).agg('mean')\n",
    "tax_info_clean=pd.DataFrame(tax_info_clean).reset_index()\n",
    "#税额分箱\n",
    "tax_info_clean['TAX_AMOUNT']=tax_info_clean['TAX_AMOUNT'].fillna(tax_info_clean['TAX_AMOUNT'].median())\n",
    "tax_info_clean['bucket_TAX_AMOUNT']=pd.qcut(tax_info_clean['TAX_AMOUNT'], 10, labels=False,duplicates='drop')\n",
    "print('finished .............')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## base_info数据较为重要，需要构建诸多交叉特征以及特征分箱"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 358,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "  0%|                                                                                        | 0/24865 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "编码完毕.................\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████| 24865/24865 [00:00<00:00, 152953.54it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████| 24865/24865 [00:00<00:00, 160822.85it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████| 24865/24865 [00:00<00:00, 144950.39it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████| 24865/24865 [00:00<00:00, 151077.50it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████| 24865/24865 [00:00<00:00, 169602.08it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████| 24865/24865 [00:00<00:00, 149671.67it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(24865, 42)"
      ]
     },
     "execution_count": 358,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# #处理base_info数据\n",
    "base_info['opto']=pd.to_datetime(base_info['opto']).fillna(pd.to_datetime(base_info['opto']).max())\n",
    "base_info['opfrom']=pd.to_datetime(base_info['opfrom'])\n",
    "base_info['gap_year']=(base_info['opto']-base_info['opfrom']).dt.total_seconds()//3600//24//365\n",
    "base_info_clean=base_info.drop(['opscope','opfrom','opto'],axis=1)\n",
    "\n",
    "#............................对object类型进行编码...............................\n",
    "base_info_clean['industryphy']=base_info_clean['industryphy'].fillna(\"无\")\n",
    "base_info_clean['dom']=base_info_clean['dom'].fillna(\"无\")\n",
    "base_info_clean['opform']=base_info_clean['opform'].fillna(\"无\")\n",
    "base_info_clean['oploc']=base_info_clean['oploc'].fillna(\"无\")\n",
    "#\n",
    "dic={}\n",
    "cate=base_info_clean.industryphy.unique()\n",
    "for i in range(len(cate)):\n",
    "    dic[cate[i]]=i\n",
    "base_info_clean['industryphy']=base_info_clean['industryphy'].map(dic)\n",
    "#\n",
    "dic={}\n",
    "cate=base_info_clean.dom.unique()\n",
    "for i in range(len(cate)):\n",
    "    dic[cate[i]]=i\n",
    "base_info_clean['dom']=base_info_clean['dom'].map(dic)\n",
    "#\n",
    "dic={}\n",
    "cate=base_info_clean.opform.unique()\n",
    "for i in range(len(cate)):\n",
    "    dic[cate[i]]=i\n",
    "base_info_clean['opform']=base_info_clean['opform'].map(dic)\n",
    "#\n",
    "dic={}\n",
    "cate=base_info_clean.oploc.unique()\n",
    "for i in range(len(cate)):\n",
    "    dic[cate[i]]=i\n",
    "base_info_clean['oploc']=base_info_clean['oploc'].map(dic)\n",
    "#\n",
    "base_info_clean=base_info_clean.fillna(-1)\n",
    "#\n",
    "print('编码完毕.................')\n",
    "#........................分箱.................................\n",
    "def bucket(name,bucket_len):\n",
    "    gap_list=[base_info_clean[name].quantile(i/bucket_len) for i in range(bucket_len+1)]#以分位数作为分箱标志\n",
    "    len_data=len(base_info_clean[name])\n",
    "    new_col=[]\n",
    "    for i in base_info_clean[name].values:\n",
    "        for j in range(len(gap_list)):\n",
    "            if gap_list[j]>=i:\n",
    "                encode=j\n",
    "                break\n",
    "        new_col.append(encode)\n",
    "    return new_col\n",
    "#注册资本_实缴资本\n",
    "base_info_clean['regcap_reccap']=base_info_clean['regcap']-base_info_clean['reccap']\n",
    "#注册资本分箱\n",
    "base_info_clean['regcap']=base_info_clean['regcap'].fillna(base_info_clean['regcap'].median())\n",
    "base_info_clean['bucket_regcap']=pd.qcut(base_info_clean['regcap'], 10, labels=False,duplicates='drop')\n",
    "#实缴资本分箱\n",
    "base_info_clean['reccap']=base_info_clean['reccap'].fillna(base_info_clean['reccap'].median())\n",
    "base_info_clean['bucket_reccap']=pd.qcut(base_info_clean['reccap'], 10, labels=False,duplicates='drop')\n",
    "#注册资本_实缴资本分箱\n",
    "base_info_clean['regcap_reccap']=base_info_clean['regcap_reccap'].fillna(base_info_clean['regcap_reccap'].median())\n",
    "base_info_clean['bucket_regcap_reccap']=pd.qcut(base_info_clean['regcap_reccap'], 10, labels=False,duplicates='drop')\n",
    "#.............................交叉.........................\n",
    "#作两个特征的交叉\n",
    "def cross_two(name_1,name_2):\n",
    "    new_col=[]\n",
    "    encode=0\n",
    "    dic={}\n",
    "    val_1=base_info_clean[name_1]\n",
    "    val_2=base_info_clean[name_2]\n",
    "    for i in tqdm(range(len(val_1))):\n",
    "        tmp=str(val_1[i])+'_'+str(val_2[i])\n",
    "        if tmp in dic:\n",
    "            new_col.append(dic[tmp])\n",
    "        else:\n",
    "            dic[tmp]=encode\n",
    "            new_col.append(encode)\n",
    "            encode+=1\n",
    "    return new_col\n",
    "#作企业类型-小类的交叉特征\n",
    "base_info_clean['enttypegb']=base_info_clean['enttypegb'].fillna(\"无\")\n",
    "base_info_clean['enttypeitem']=base_info_clean['enttypeitem'].fillna(\"无\")\n",
    "new_col=cross_two('enttypegb','enttypeitem')#作企业类型-小类的交叉特征\n",
    "base_info_clean['enttypegb_enttypeitem']=new_col\n",
    "#\n",
    "#行业类别-细类的交叉特征\n",
    "base_info_clean['industryphy']=base_info_clean['industryphy'].fillna(\"无\")\n",
    "base_info_clean['industryco']=base_info_clean['industryco'].fillna(\"无\")\n",
    "new_col=cross_two('industryphy','industryco')#作企业类型-小类的交叉特征\n",
    "base_info_clean['industryphy_industryco']=new_col\n",
    "#企业类型-行业类别的交叉特征\n",
    "new_col=cross_two('enttypegb','industryphy')#作企业类型-小类的交叉特征\n",
    "base_info_clean['enttypegb_industryphy']=new_col\n",
    "#行业类别-企业类型小类的交叉特征\n",
    "new_col=cross_two('industryphy','enttypeitem')#作企业类型-小类的交叉特征\n",
    "base_info_clean['industryphy_enttypeitem']=new_col\n",
    "#行业类别细类--企业类型小类的交叉特征\n",
    "new_col=cross_two('industryco','enttypeitem')#作企业类型-小类的交叉特征\n",
    "base_info_clean['industryco_enttypeitem']=new_col\n",
    "\n",
    "#企业类型-小类-行业类别-细类的交叉特征\n",
    "new_col=cross_two('enttypegb_enttypeitem','industryphy_industryco')#作企业类型-小类的交叉特征\n",
    "base_info_clean['enttypegb_enttypeitem_industryphy_industryco']=new_col\n",
    "base_info_clean.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## category特征单独提取出来"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 359,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_features=['industryphy','dom','opform','oploc','bucket_regcap',\n",
    "              'bucket_reccap','bucket_regcap_reccap',\n",
    "              'enttypegb','enttypeitem','enttypegb_enttypeitem',\n",
    "              'enttypegb_industryphy','enttypegb_enttypeitem_industryphy_industryco',\n",
    "              'industryphy','industryco','industryphy_industryco',\n",
    "              'industryphy_enttypeitem','industryco_enttypeitem',\n",
    "              'adbusign','townsign','regtype','TAX_CATEGORIES','bucket_TAX_AMOUNT',\n",
    "              'legal_judgment_num','brand_num','patent_num','positive_negtive'\n",
    "             ]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 360,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(24865, 73)"
      ]
     },
     "execution_count": 360,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#\n",
    "all_data=base_info_clean.merge(annual_report_info_clean,how='outer')\n",
    "all_data=all_data.merge(tax_info_clean,how='outer')\n",
    "all_data=all_data.merge(change_info_clean,how='outer')\n",
    "all_data=all_data.merge(news_info_clean,how='outer')\n",
    "all_data=all_data.merge(other_info_clean,how='outer')\n",
    "all_data=all_data.fillna(-1)\n",
    "all_data[cat_features]=all_data[cat_features].astype(int)\n",
    "all_data.shape#,base_info.shape,annual_report_info.shape,tax_info.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 361,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((14865, 72), (10000, 72))"
      ]
     },
     "execution_count": 361,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#\n",
    "train_df=all_data.merge(entprise_info)\n",
    "train_data=train_df.drop(['id','label'],axis=1)\n",
    "kind=train_df['label']\n",
    "test_df=all_data[all_data['id'].isin(entprise_evaluate['id'].unique().tolist())]\n",
    "test_df=test_df.reset_index(drop=True)\n",
    "test_data=test_df.drop(['id'],axis=1)\n",
    "train_data.shape,test_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 362,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # 伪标签\n",
    "# use_pseudo=True\n",
    "# if use_pseudo:\n",
    "#     train_data=train_df.drop(['id','label'],axis=1)\n",
    "#     kind=train_df['label']\n",
    "#     pseudo_name=[]\n",
    "#     pseudo_label={'id':[],'label':[]}\n",
    "#     pseudo_df=pd.read_csv('submit_857_xgb_rf_lgb_cab.csv')\n",
    "#     for index,name,score in pseudo_df.itertuples():\n",
    "#         if score>0.9 or score<0.05:\n",
    "#             pseudo_label['id'].append(name)\n",
    "#             if score>0.9 :\n",
    "#                 pseudo_label['label'].append(1)\n",
    "#             else:\n",
    "#                 pseudo_label['label'].append(0)\n",
    "#             pseudo_name.append(name)\n",
    "#     len(pseudo_name)\n",
    "#     pseudo_data=test_df[test_df.id.isin(pseudo_name)].reset_index(drop=True)\n",
    "#     pseudo_data=pseudo_data.merge(pd.DataFrame(pseudo_label))\n",
    "#     #\n",
    "#     train_df=pd.concat((train_df,pseudo_data)).reset_index(drop=True)\n",
    "#     train_data=train_df.drop(['id','label'],axis=1)\n",
    "#     kind=train_df['label']\n",
    "#     print(train_data.shape,test_data.shape)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 363,
   "metadata": {},
   "outputs": [],
   "source": [
    "#特征筛选\n",
    "# frt_select=[\n",
    "#  'industryphy',\n",
    "#  'enttypegb',\n",
    "#  'regcap',\n",
    "#  'townsign',\n",
    "#  'industryco',\n",
    "#  'bucket_regcap',\n",
    "#  'empnum',\n",
    "#  'bucket_reccap',\n",
    "#  'enttypeitem',\n",
    "#  'industryphy_industryco',\n",
    "#  'reccap',\n",
    "#  'FORINVESTSIGN',\n",
    "#  'positive_negtive',\n",
    "#  'regtype',\n",
    "#  'STOCKTRANSIGN',\n",
    "#  'bucket_regcap_reccap',\n",
    "#  'enttypegb_enttypeitem',\n",
    "#  'regcap_reccap',\n",
    "#  'legal_judgment_num',\n",
    "#  'TAX_CATEGORIES',\n",
    "#  'TAX_AMOUNT',\n",
    "#  'bgq_bgh',\n",
    "#  'TAX_ITEMS']\n",
    "# frt_select=important_frt[:30]\n",
    "# train_data=train_data[frt_select]\n",
    "# test_data=test_data[frt_select]\n",
    "# cat_features=list(set(frt_select).intersection(set(cat_features)))\n",
    "# cat_features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 364,
   "metadata": {},
   "outputs": [],
   "source": [
    "def eval_score(y_test,y_pre):\n",
    "    _,_,f_class,_=precision_recall_fscore_support(y_true=y_test,y_pred=y_pre,labels=[0,1],average=None)\n",
    "    fper_class={'合法':f_class[0],'违法':f_class[1],'f1':f1_score(y_test,y_pre)}\n",
    "    return fper_class\n",
    "#\n",
    "def k_fold_serachParmaters(model,train_val_data,train_val_kind):\n",
    "    mean_f1=0\n",
    "    mean_f1Train=0\n",
    "    n_splits=5\n",
    "    sk = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2020)\n",
    "    for train, test in sk.split(train_val_data, train_val_kind):\n",
    "        x_train = train_val_data.iloc[train]\n",
    "        y_train = train_val_kind.iloc[train]\n",
    "        x_test = train_val_data.iloc[test]\n",
    "        y_test = train_val_kind.iloc[test]\n",
    "\n",
    "        model.fit(x_train, y_train)\n",
    "        pred = model.predict(x_test)\n",
    "        fper_class =  eval_score(y_test,pred)\n",
    "        mean_f1+=fper_class['f1']/n_splits\n",
    "        #print(fper_class)\n",
    "        \n",
    "        pred_Train = model.predict(x_train)\n",
    "        fper_class_train =  eval_score(y_train,pred_Train)\n",
    "        mean_f1Train+=fper_class_train['f1']/n_splits\n",
    "    #print('mean valf1:',mean_f1)\n",
    "    #print('mean trainf1:',mean_f1Train)\n",
    "    return mean_f1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 365,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "xlf: 0.8388161665862973\n",
      "llf: 0.8368155931835128\n",
      "clf: 0.8426049545513843\n",
      "rf: 0.8315731221429744\n"
     ]
    }
   ],
   "source": [
    "xlf=xgb.XGBClassifier(max_depth=7\n",
    "                      ,learning_rate=0.05\n",
    "                      ,n_estimators=55\n",
    "                      ,reg_alpha=0.005\n",
    "                      ,n_jobs=8\n",
    "                      ,importance_type='total_cover'\n",
    "                     )\n",
    "#\n",
    "llf=lgb.LGBMClassifier(num_leaves=9\n",
    "                           ,max_depth=5\n",
    "                           ,learning_rate=0.05\n",
    "                           ,n_estimators=80\n",
    "                           ,n_jobs=8\n",
    "                           )\n",
    "  \n",
    "clf=cab.CatBoostClassifier(iterations=60\n",
    "                              ,learning_rate=0.05\n",
    "                              ,depth=10\n",
    "                              ,silent=True\n",
    "                              ,thread_count=8\n",
    "                              ,task_type='CPU'\n",
    "                              ,cat_features=cat_features\n",
    "                              )\n",
    "\n",
    "rf = RandomForestClassifier(oob_score=True, random_state=2020,\n",
    "            n_estimators= 70,max_depth=13,min_samples_split=5)\n",
    "k_fold_serachParmaters(rf,train_data,kind)\n",
    "print('xlf:',k_fold_serachParmaters(xlf,train_data,kind))\n",
    "print('llf:',k_fold_serachParmaters(llf,train_data,kind))\n",
    "print('clf:',k_fold_serachParmaters(clf,train_data,kind)) \n",
    "print('rf:',k_fold_serachParmaters(rf,train_data,kind)) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 366,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "每1次验证的f1:0.8734491315136476\n",
      "每2次验证的f1:0.8678304239401495\n",
      "每3次验证的f1:0.8370927318295739\n",
      "每4次验证的f1:0.8593350383631714\n",
      "每5次验证的f1:0.8320802005012532\n",
      "mean f1: 0.8539575052295592\n"
     ]
    }
   ],
   "source": [
    "# #\n",
    "details = []\n",
    "answers = []\n",
    "mean_f1=0\n",
    "n_splits=5\n",
    "sk = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2020)\n",
    "cnt=0\n",
    "for train, test in sk.split(train_data, kind):\n",
    "    x_train = train_data.iloc[train]\n",
    "    y_train = kind.iloc[train]\n",
    "    x_test = train_data.iloc[test]\n",
    "    y_test = kind.iloc[test]\n",
    "\n",
    "    xlf.fit(x_train, y_train)\n",
    "    pred_xgb = xlf.predict(x_test)\n",
    "    weight_xgb = eval_score(y_test,pred_xgb)['f1']\n",
    "\n",
    "    llf.fit(x_train, y_train)\n",
    "    pred_llf = llf.predict(x_test)\n",
    "    weight_lgb = eval_score(y_test,pred_llf)['f1']\n",
    "\n",
    "    clf.fit(x_train, y_train)\n",
    "    pred_cab = clf.predict(x_test)\n",
    "    weight_cab =  eval_score(y_test,pred_cab)['f1']\n",
    "\n",
    "    rf.fit(x_train, y_train)\n",
    "    pred_rf = rf.predict(x_test)\n",
    "    weight_rf =  eval_score(y_test,pred_rf)['f1']\n",
    "\n",
    "\n",
    "    prob_xgb = xlf.predict_proba(x_test)\n",
    "    prob_lgb = llf.predict_proba(x_test)\n",
    "    prob_cab = clf.predict_proba(x_test)\n",
    "    prob_rf = rf.predict_proba(x_test)\n",
    "\n",
    "    scores = []\n",
    "    ijkl = []\n",
    "    weight = np.arange(0, 1.05, 0.1)\n",
    "    for i, item1 in enumerate(weight):\n",
    "        for j, item2 in enumerate(weight[weight <= (1 - item1)]):\n",
    "            for k, item3 in enumerate(weight[weight <= (1 - item1-item2)]):\n",
    "                prob_end = prob_xgb * item1 + prob_lgb * item2 + prob_cab *item3+prob_rf*(1 - item1 - item2-item3)\n",
    "                #prob_end = np.sqrt(prob_xgb**2 * item1 + prob_lgb**2 * item2 + prob_cab**2 *item3+prob_rf**2*(1 - item1 - item2-item3))\n",
    "                score = eval_score(y_test,np.argmax(prob_end,axis=1))['f1']\n",
    "                scores.append(score)\n",
    "                ijkl.append((item1, item2,item3, 1 - item1 - item2-item3))\n",
    "\n",
    "    ii = ijkl[np.argmax(scores)][0]\n",
    "    jj = ijkl[np.argmax(scores)][1]\n",
    "    kk = ijkl[np.argmax(scores)][2]\n",
    "    ll = ijkl[np.argmax(scores)][3]\n",
    "\n",
    "    details.append(max(scores))\n",
    "    details.append(weight_xgb)\n",
    "    details.append(weight_lgb)\n",
    "    details.append(weight_cab)\n",
    "    details.append(weight_rf)\n",
    "    details.append(ii)\n",
    "    details.append(jj)\n",
    "    details.append(kk)\n",
    "    details.append(ll)\n",
    "\n",
    "    cnt+=1\n",
    "    print('每{}次验证的f1:{}'.format(cnt,max(scores)))\n",
    "    mean_f1+=max(scores)/n_splits\n",
    "\n",
    "    test_xgb = xlf.predict_proba(test_data)\n",
    "    test_lgb = llf.predict_proba(test_data)\n",
    "    test_cab = clf.predict_proba(test_data)\n",
    "    test_rf = rf.predict_proba(test_data)\n",
    "    #加权平均\n",
    "    ans = test_xgb * ii + test_lgb * jj + test_cab * kk + test_rf*ll#加权平均\n",
    "    #加权平方平均\n",
    "    #ans = np.sqrt(test_xgb**2 * ii + test_lgb**2 * jj + test_cab**2 * kk + test_rf**2*ll)\n",
    "    answers.append(ans)\n",
    "print('mean f1:',mean_f1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 367,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>test_end_score</th>\n",
       "      <th>xgboost</th>\n",
       "      <th>lightgbm</th>\n",
       "      <th>catboost</th>\n",
       "      <th>rf</th>\n",
       "      <th>weight_xgboost</th>\n",
       "      <th>weight_lightgbm</th>\n",
       "      <th>weight_catboost</th>\n",
       "      <th>weight_rf</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.873449</td>\n",
       "      <td>0.853598</td>\n",
       "      <td>0.833333</td>\n",
       "      <td>0.862843</td>\n",
       "      <td>0.822622</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0.1</td>\n",
       "      <td>0.7</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.867830</td>\n",
       "      <td>0.860697</td>\n",
       "      <td>0.865672</td>\n",
       "      <td>0.853598</td>\n",
       "      <td>0.851385</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.7</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.837093</td>\n",
       "      <td>0.824390</td>\n",
       "      <td>0.819753</td>\n",
       "      <td>0.831234</td>\n",
       "      <td>0.822335</td>\n",
       "      <td>0.1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.7</td>\n",
       "      <td>0.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.859335</td>\n",
       "      <td>0.841837</td>\n",
       "      <td>0.842377</td>\n",
       "      <td>0.852792</td>\n",
       "      <td>0.833760</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.3</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.832080</td>\n",
       "      <td>0.813559</td>\n",
       "      <td>0.822943</td>\n",
       "      <td>0.828283</td>\n",
       "      <td>0.827763</td>\n",
       "      <td>0.3</td>\n",
       "      <td>0.1</td>\n",
       "      <td>0.1</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   test_end_score   xgboost  lightgbm  catboost        rf  weight_xgboost  \\\n",
       "0        0.873449  0.853598  0.833333  0.862843  0.822622             0.2   \n",
       "1        0.867830  0.860697  0.865672  0.853598  0.851385             0.0   \n",
       "2        0.837093  0.824390  0.819753  0.831234  0.822335             0.1   \n",
       "3        0.859335  0.841837  0.842377  0.852792  0.833760             0.0   \n",
       "4        0.832080  0.813559  0.822943  0.828283  0.827763             0.3   \n",
       "\n",
       "   weight_lightgbm  weight_catboost  weight_rf  \n",
       "0              0.1              0.7        0.0  \n",
       "1              0.7              0.0        0.3  \n",
       "2              0.0              0.7        0.2  \n",
       "3              0.3              0.5        0.2  \n",
       "4              0.1              0.1        0.5  "
      ]
     },
     "execution_count": 367,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df=pd.DataFrame(np.array(details).reshape(int(len(details)/9),9)\n",
    "                ,columns=['test_end_score','xgboost','lightgbm','catboost','rf'\n",
    "                ,'weight_xgboost','weight_lightgbm','weight_catboost','weight_rf'])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 368,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "test_end_score     0.853958\n",
       "xgboost            0.838816\n",
       "lightgbm           0.836816\n",
       "catboost           0.845750\n",
       "rf                 0.831573\n",
       "weight_xgboost     0.120000\n",
       "weight_lightgbm    0.240000\n",
       "weight_catboost    0.400000\n",
       "weight_rf          0.240000\n",
       "dtype: float64"
      ]
     },
     "execution_count": 368,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 369,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'submit_853.csv'"
      ]
     },
     "execution_count": 369,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#\n",
    "fina=sum(answers)/n_splits#\n",
    "#fina=np.sqrt(sum(np.array(answers)**2)/n_splits)#平方平均\n",
    "fina=fina[:,1]\n",
    "test_df['score']=fina#可选:fina_persudo是伪标签的预测结果\n",
    "submit_csv=test_df[['id','score']]\n",
    "save_path='submit_'+str(int(mean_f1*1000))+'.csv'\n",
    "submit_csv.to_csv(save_path,index=False)\n",
    "save_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 370,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3</td>\n",
       "      <td>0.005466</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>da8691b210adb3f67820f5e0c87b337d63112cee52211888</td>\n",
       "      <td>0.005194</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9c7fa510616a68309e4badf2a7a3123c0462fb85bf28ef17</td>\n",
       "      <td>0.004856</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>f000950527a6feb6ed308bc4c7ae11276eab86480f8e03db</td>\n",
       "      <td>0.004610</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>f000950527a6feb617e8d6ca7025dcf9d765429969122069</td>\n",
       "      <td>0.005470</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9995</th>\n",
       "      <td>f1c1045b13d18329a2bd99d2a7e2227688c0d69bf1d1e325</td>\n",
       "      <td>0.005234</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9996</th>\n",
       "      <td>f000950527a6feb6bde38216d7cbbf32e66d3a3a96d4dbda</td>\n",
       "      <td>0.542135</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9997</th>\n",
       "      <td>da8691b210adb3f65b43370d3a362f4aa1d3b16b5ba0c9d7</td>\n",
       "      <td>0.004341</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9998</th>\n",
       "      <td>516ab81418ed215dcbbf0614a7b929e691f8eed153d7bb31</td>\n",
       "      <td>0.035893</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999</th>\n",
       "      <td>9c7fa510616a68303d3427d4bfd4b0cf3e4843f2bf3f637a</td>\n",
       "      <td>0.037939</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10000 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    id     score\n",
       "0     9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3  0.005466\n",
       "1     da8691b210adb3f67820f5e0c87b337d63112cee52211888  0.005194\n",
       "2     9c7fa510616a68309e4badf2a7a3123c0462fb85bf28ef17  0.004856\n",
       "3     f000950527a6feb6ed308bc4c7ae11276eab86480f8e03db  0.004610\n",
       "4     f000950527a6feb617e8d6ca7025dcf9d765429969122069  0.005470\n",
       "...                                                ...       ...\n",
       "9995  f1c1045b13d18329a2bd99d2a7e2227688c0d69bf1d1e325  0.005234\n",
       "9996  f000950527a6feb6bde38216d7cbbf32e66d3a3a96d4dbda  0.542135\n",
       "9997  da8691b210adb3f65b43370d3a362f4aa1d3b16b5ba0c9d7  0.004341\n",
       "9998  516ab81418ed215dcbbf0614a7b929e691f8eed153d7bb31  0.035893\n",
       "9999  9c7fa510616a68303d3427d4bfd4b0cf3e4843f2bf3f637a  0.037939\n",
       "\n",
       "[10000 rows x 2 columns]"
      ]
     },
     "execution_count": 370,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "submit_csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "#观察训练/验证过程\n",
    "# df=pd.DataFrame(np.array(details).reshape(int(len(details)/7),7)\n",
    "#                 ,columns=['test_end_score','xgboost','lightgbm','catboost'\n",
    "#                 ,'weight_xgboost','weight_lightgbm','weight_catboost'])\n",
    "# df\n",
    "\n",
    "# df.mean()\n",
    "\n",
    "xlf_impt=xlf.feature_importances_\n",
    "llf_impt=llf.feature_importances_/sum(llf.feature_importances_)\n",
    "clf_impt=clf.feature_importances_/sum(clf.feature_importances_)\n",
    "rf_impt=rf.feature_importances_/sum(rf.feature_importances_)\n",
    "\n",
    "importance=pd.DataFrame({\n",
    "    'column':train_data.columns,\n",
    "    'importance':xlf_impt+llf_impt+clf_impt+rf_impt,\n",
    "}).sort_values(by='importance')\n",
    "importance=importance.reset_index(drop=True)\n",
    "important_frt=list(importance['column'].values)\n",
    "important_frt.reverse()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 216,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['industryco',\n",
       " 'regcap',\n",
       " 'reccap',\n",
       " 'industryphy',\n",
       " 'regcap_reccap',\n",
       " 'TAX_AMOUNT',\n",
       " 'enttypegb_enttypeitem',\n",
       " 'industryphy_enttypeitem',\n",
       " 'tfidi_opscope',\n",
       " 'enttypegb',\n",
       " 'gap_year',\n",
       " 'enttypegb_industryphy',\n",
       " 'enttypeminu',\n",
       " 'industryphy_industryco',\n",
       " 'enttypeitem',\n",
       " 'townsign',\n",
       " 'bucket_regcap',\n",
       " 'bgxmdm',\n",
       " 'empnum',\n",
       " 'positive_negtive',\n",
       " 'bucket_regcap_reccap',\n",
       " 'exenum',\n",
       " 'FORINVESTSIGN',\n",
       " 'oplocdistrict',\n",
       " 'industryco_enttypeitem',\n",
       " 'enttypegb_enttypeitem_industryphy_industryco',\n",
       " 'COLGRANUM',\n",
       " 'bucket_reccap',\n",
       " 'patent_num',\n",
       " 'EMPNUM']"
      ]
     },
     "execution_count": 216,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "important_frt[:30]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "数据集1：base_info.csv\n",
    "包含数据集7和8中涉及到的所有企业的基本信息，每一行代表一个企业的基本数据，每一行有33列，其中id列为企业唯一标识，列之间采用“,”分隔符分割。\n",
    "数据格式如下：\n",
    "[id:企业唯一标识, oplocdistrict:行政区划代码, industryphy:行业类别代码, industryco:行业细类代码, dom:经营地址, opscope:经营范围, enttype:企业类型, enttypeitem:企业类型小类, opfrom:经营期限起, opto:经营期限止, state:状态, orgid:机构标识, jobid:职位标识, adbusign:是否广告经营, townsign:是否城镇, regtype:主题登记类型, empnum:从业人数, compform:组织形式, parnum:合伙人数, exenum:执行人数, opform:经营方式, ptbusscope:兼营范围, venind:风险行业, enttypeminu:企业类型细类, midpreindcode:中西部优势产业代码, protype:项目类型, oploc:经营场所, regcap:注册资本（金）, reccap:实缴资本, forreccap:实缴资本（外方）, forregcap:注册资本（外方）, congro:投资总额, enttypegb:企业（机构）类型]\n",
    "数据集2：annual_report_info.csv\n",
    "包含数据集7和8中涉及到的企业的年报基本信息，每一行代表一个企业的年报基本数据，每一行有23列，其中id列为企业唯一标识，列之间采用“,”分隔符分割。\n",
    "数据格式如下：\n",
    "[id:企业唯一标识, ANCHEYEAR:年度, STATE:状态, FUNDAM:资金数额, MEMNUM:成员人数, FARNUM:农民人数, ANNNEWMEMNUM:本年度新增成员人数, ANNREDMEMNUM:本年度退出成员人数, EMPNUM:从业人数, EMPNUMSIGN:从业人数是否公示, BUSSTNAME:经营状态名称, COLGRANUM:其中高校毕业生人数经营者, RETSOLNUM:其中退役士兵人数经营者, DISPERNUM:其中残疾人人数经营者, UNENUM:其中下岗失业人数经营者, COLEMPLNUM:其中高校毕业生人数雇员, RETEMPLNUM:其中退役士兵人数雇员, DISEMPLNUM:其中残疾人人数雇员, UNEEMPLNUM:其中下岗失业人数雇员, WEBSITSIGN:是否有网站标志, FORINVESTSIGN:是否有对外投资企业标志, STOCKTRANSIGN:有限责任公司本年度是否发生股东股权转让标志, PUBSTATE:公示状态：1 全部公示，2部分公示,3全部不公示]\n",
    "数据集3：tax_info.csv\n",
    "包含数据集7和8中涉及到的企业的纳税信息，每一行代表一个企业的纳税信息，每一行有9列，其中id列为企业唯一标识，列之间采用“,”分隔符分割。\n",
    "数据格式如下：\n",
    "[id:企业唯一标识, START_DATE:起始时间, END_DATE:终止时间, TAX_CATEGORIES:税种, TAX_ITEMS:税目, TAXATION_BASIS:计税依据, TAX_RATE:税率, DEDUCTION:扣除数, TAX_AMOUNT:税额]\n",
    "数据集4：change_info.csv\n",
    "包含数据集7和8中涉及到的企业的变更信息，每一行代表一个企业变更信息，每一行5列，其中id列为企业唯一标识，列之间采用“,”分隔符分割。\n",
    "数据格式如下：\n",
    "[id:企业唯一标识, bgxmdm:变更信息代码, bgq:变更前, bgh:变更后, bgrq:变更日期]\n",
    "数据集5：news_info.csv\n",
    "包含数据集7和8中涉及到的企业的新闻舆情信息，每一行代表一个企业新闻舆情，每一行3列，其中id列为企业唯一标识，列之间采用“,”分隔符分割。\n",
    "数据格式如下：\n",
    "[id:企业唯一标识, positive_negtive:新闻正负面性, public_date:发布日期]\n",
    "数据集6：other_info.csv\n",
    "包含数据集7和8中涉及到的企业的其他信息，每一行代表一个企业其他信息，每一行4列，其中id列为企业唯一标识，列之间采用“,”分隔符分割。\n",
    "数据格式如下：\n",
    "[id:企业唯一标识, legal_judgment_num:裁判文书数量, brand_num:注册商标数量, patent_num:专利数量]\n",
    "数据集7：entprise_info.csv\n",
    "带标注的企业数据。每一行代表一个企业，每一行2列，其中id列为企业唯一标识，label列为标注（1：有非法集资风险，0：无非法集资风险），列之间采用“,”分隔符分割。\n",
    "数据集8（验证集）：entprise_evaluate.csv\n",
    "未标注企业数据。参赛队伍需提交的最终结果数据集，每一行代表一个企业，每一行有 2 列, 其中id列为企业唯一标识，score列为空，列之间采用“,”分隔符分割。"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "huawei_copy",
   "language": "python",
   "name": "huawei_copy"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
